clinical_data <-
read_tsv(
file = "../data/clinical_data_clean.tsv")
## Rows: 1904 Columns: 19
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): cancer_type_detailed, type_of_breast_surgery, cellularity, P50CL_Subtype, TGC_Subtype, her2_status, er_statu...
## dbl (10): age_at_diagnosis, chemotherapy, radio_therapy, hormone_therapy, lymph_nodes_examined_positive, neoplasm_hist...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gene_expression <-
read_tsv(
file = "../data/gene_expression.tsv")
## Rows: 1904 Columns: 489
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (489): brca1, brca2, palb2, pten, tp53, atm, cdh1, chek2, nbn, nf1, stk11, bard1, mlh1, msh2, msh6, pms2, epcam, r...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gene_expression_aug <-
read_tsv(
file = "../data/gene_expression_aug.tsv")
## Rows: 931056 Columns: 3
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): gene
## dbl (2): overall_survival, expr_level
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
In this analysis we would like to perform a PCA analysis on the genes to explore clusters, trends etc. Moreover, we would like to fit a linear regression model to each of the genes.
Fitting PCA to the gene expression data, centering the data and scaling it for normalization.
pca_fit <-
gene_expression |>
prcomp(center = TRUE,
scale = TRUE)
Utilizing the tidy function to extract the eigenvalues from each PC component, slicing the 10 first rows as these are the only interesting, and plotting each component and variance explained as a bar chart.
plot_variance <-
pca_fit |>
tidy(matrix = "eigenvalues") |>
slice(1:10) |>
ggplot(aes(PC,
percent)) +
geom_col(fill = "blue",
alpha = 0.7) +
scale_x_continuous(breaks = 1:10) +
scale_y_continuous(labels = scales::percent_format())
plot_variance
Using augment function to add the clinical data variables to the existing data frame, and plotting the first and second component.
PCA_plot <-
pca_fit |>
augment(clinical_data) |>
ggplot(aes(.fittedPC1,
.fittedPC2)) +
geom_point(size = 0.5) +
labs(x = "Principal Component 1",
y = "Principal Component 2") +
theme(legend.position="right")
Coloring the PCA according to clinical data
PCA_er_status <-
PCA_plot +
aes(color = er_status)
PCA_her2_status <-
PCA_plot +
aes(color = her2_status)
PCA_pr_status <-
PCA_plot +
aes(color = pr_status)
PCA_Chemotherapy <-
PCA_plot +
aes(color = factor(chemotherapy)) +
labs(color = "Chemotherapy")
PCA_Hormonetherapy <-
PCA_plot +
aes(color = factor(hormone_therapy)) +
labs(color = "Hormone therapy")
PCA_Radiotherapy <-
PCA_plot +
aes(color = factor(radio_therapy)) +
labs(color = "Radio therapy")
PCA_survival <-
PCA_plot +
aes(color = factor(overall_survival)) +
labs(color = "Overall Survival")
Using patchwork to combine the plots into two plots
plot_status <-
PCA_er_status + PCA_her2_status / PCA_pr_status
plot_status
plot_therapy <-
PCA_Chemotherapy + PCA_Hormonetherapy / PCA_Radiotherapy
plot_therapy
PCA_survival
Firstly, the data is grouped by gene and outcome nested into one variable.
gene_expression_aug <-
gene_expression_aug |>
group_by(gene) |>
nest() |>
ungroup()
Letting R know we are working by gene
gene_expression_aug <-
gene_expression_aug |>
group_by(gene)
Next, we will use the map() function to apply a linear regression model to each gene, with gene_expression vs. overall_survival outcome.
gene_expression_aug <-
gene_expression_aug |>
mutate(model_object = map(.x = data,
.f = ~lm(formula = expr_level ~ overall_survival,
data = .x)))
gene_expression_aug <-
gene_expression_aug |>
mutate(model_object_tidy = map(model_object,
~ tidy(.x,
conf.int = TRUE,
conf.level = 0.95)))
gene_expression_aug |>
sample_n(1)
Next, we would like to un-nest the model_object_tidy to reveal the variables and values.
gene_expression_aug <-
gene_expression_aug |>
unnest(model_object_tidy)
After un-nesting the data, we would like to only keep gene name, p.value, estimate and confidence level
gene_estimates <-
gene_expression_aug |>
filter(term == "overall_survival") |>
select(gene,
p.value,
estimate,
conf.low,
conf.high) |>
ungroup()
Next, we would like to evaluate if the gene is considered significant to death or not
gene_estimates <-
gene_estimates |>
mutate(
q.value = p.adjust(p.value),
is_significant = case_when(
q.value < 0.05 ~ "yes",
TRUE ~ "no"
)
)
gene_estimates_significant <-
gene_estimates |>
filter(is_significant == "yes") |>
mutate(gene = fct_reorder(gene, estimate))
plot_signif_gene <-
gene_estimates_significant |>
ggplot(aes(x = estimate,
y = gene)) +
geom_point(aes(x = estimate),
size = 2) +
geom_errorbarh(aes(xmin = conf.low,
xmax = conf.high),
height = 0.2) +
geom_vline(xintercept = 0) +
theme(plot.title = element_text(size = 20,
hjust = 0.5),
axis.text.y = element_text(size = 14),
axis.text.x = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16)) +
labs(title = "Significant Up/Down regulated genes associated with death of cancer",
y = "Genes",
x = "Estimate")
plot_signif_gene
ggsave("06_PCA_variance.png",
path = "../doc/images/",
plot = plot_variance,
width = 8,
height = 5)
ggsave("06_PCA_status.png",
path = "../doc/images/",
plot = plot_status,
width = 8,
height = 5)
ggsave("06_PCA_therapy.png",
path = "../doc/images/",
plot = plot_therapy,
width = 8,
height = 5)
ggsave("06_signif_genes.png",
path = "../doc/images/",
plot = plot_signif_gene,
width = 20,
height = 20)
ggsave("06_PCA_survival.png",
path = "../doc/images/",
plot = PCA_survival,
width = 8,
height = 5)